library(C50)
library(tidyverse)
library(tidymodels)
library(janitor)
library(skimr)
library(kableExtra)
library(GGally)
library(vip)
library(fastshap)
library(MASS)
library(rpart.plot)
library(factoextra)
library(imputeMissings)
library(ISLR)
library(tree)
library(clock)
library(ggplot2)
library(dplyr)
library(lubridate)
library(corrplot)

Import

# Import ----------------------------------------------------------------
# Read the merged donor file, normalize the column names, and convert the
# posting date into "days elapsed from today" before dropping the raw date.
donor <- read_csv("C:/Users/yuxia/Downloads/DonorMerge_Final (1).csv") %>%
  clean_names() %>%
  mutate(days_from_now = as.numeric(Sys.Date() - mdy(date_posted))) %>%
  dplyr::select(-date_posted)

# Read the donation-level file and normalize its column names.
donation <- read_csv("C:/Users/yuxia/Downloads/Donations (1).csv") %>%
  clean_names()

##data preparation

# To find the missing values.
# FIX: readr::read_csv() returns character columns, never factors, so
# select_if(is.factor) always selected zero columns and the first skim
# printed nothing. Skim character columns instead so non-numeric
# missing-value summaries actually appear.
donor %>% select_if(is.character) %>% skim()
donor %>% select_if(is.numeric) %>% skim()
Data summary
Name Piped data
Number of rows 328018
Number of columns 12
_______________________
Column type frequency:
numeric 12
________________________
Group variables None

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
school_ncesid 21067 0.94 2.606752e+11 1.590522e+11 1.00005e+10 9.0495e+10 2.6100e+11 3.7000e+11 6.100000e+11 ▇▃▆▅▁
school_latitude 0 1.00 3.724000e+01 4.660000e+00 1.82500e+01 3.4040e+01 3.7670e+01 4.0760e+01 6.726000e+01 ▁▇▇▁▁
school_longitude 0 1.00 -9.344000e+01 1.781000e+01 -1.71690e+02 -1.1205e+02 -8.7690e+01 -7.9150e+01 -6.663000e+01 ▁▁▅▅▇
school_zip 2 1.00 5.310286e+04 3.123205e+04 4.10000e+02 2.7589e+04 5.3089e+04 8.5225e+04 9.995000e+04 ▆▇▂▆▇
great_messages_proportion 89712 0.73 5.482000e+01 3.507000e+01 0.00000e+00 3.3000e+01 5.7000e+01 8.4000e+01 1.000000e+02 ▆▃▆▆▇
teacher_referred_count 47338 0.86 9.300000e-01 2.350000e+00 0.00000e+00 0.0000e+00 0.0000e+00 1.0000e+00 1.250000e+02 ▇▁▁▁▁
non_teacher_referred_count 47338 0.86 4.580000e+00 5.910000e+00 0.00000e+00 1.0000e+00 3.0000e+00 5.0000e+00 3.040000e+02 ▇▁▁▁▁
fulfillment_labor_materials 17492 0.95 2.762000e+01 8.810000e+00 9.00000e+00 1.7000e+01 3.0000e+01 3.5000e+01 3.500000e+01 ▁▂▁▁▇
total_price_excluding_optional_s 0 1.00 5.510900e+02 1.801938e+04 0.00000e+00 2.6817e+02 4.1058e+02 5.7910e+02 1.025002e+07 ▇▁▁▁▁
total_price_including_optional_s 0 1.00 6.561900e+02 2.197364e+04 0.00000e+00 3.1902e+02 4.8750e+02 6.8816e+02 1.250002e+07 ▇▁▁▁▁
students_reached 82 1.00 9.960000e+01 2.733940e+03 0.00000e+00 2.2000e+01 3.0000e+01 1.0000e+02 9.999990e+05 ▇▁▁▁▁
days_from_now 0 1.00 4.315460e+03 7.992100e+02 3.26400e+03 3.6870e+03 4.1530e+03 4.8120e+03 7.391000e+03 ▇▆▂▁▁
# Same missing-value check for the donation table.
# FIX: read_csv() yields character (not factor) columns — skim characters
# rather than factors so the non-numeric summary is not empty.
donation %>% select_if(is.character) %>% skim()
donation %>% select_if(is.numeric) %>% skim()
Data summary
Name Piped data
Number of rows 1048575
Number of columns 4
_______________________
Column type frequency:
numeric 4
________________________
Group variables None

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
donor_zip 580554 0.45 50667.01 33285.41 0.00 20003.0 46902.00 89135.00 99999 ▇▆▂▅▇
donation_to_project 8 1.00 65.75 215.43 -11.80 10.0 21.25 50.00 85000 ▇▁▁▁▁
donation_optional_support 8 1.00 10.70 32.97 -0.02 1.5 3.75 7.50 15000 ▇▁▁▁▁
donation_total 8 1.00 76.45 243.75 -11.80 10.0 25.00 56.47 100000 ▇▁▁▁▁

explore target

# Class balance of the target: count and share of exciting projects.
donor %>%
  group_by(is_exciting) %>%
  summarise(n = n(), .groups = "drop") %>%
  mutate(pct = n / sum(n))

explore numeric variables

# Keep a trimmed copy for plotting: drop extreme prices / counts so the
# boxplots are not dominated by a handful of outliers.
donormerge <- donor %>%
  filter(
    total_price_excluding_optional_s < 1000,
    total_price_including_optional_s < 1000,
    students_reached < 250,
    teacher_referred_count < 5
  )

# Boxplot of one numeric column (name given as a string) split by target.
# NOTE(review): this masks graphics::boxplot() for the rest of the session.
boxplot <- function(m) {
  ggplot(donormerge,
         aes(x = .data[[m]],
             y = as.factor(is_exciting),
             fill = as.factor(is_exciting))) +
    geom_boxplot() +
    labs(title = as.character(m), y = 'exciting')
}

numerics <- c('school_latitude','school_longitude','great_messages_proportion','total_price_excluding_optional_s','total_price_including_optional_s','students_reached','days_from_now','teacher_referred_count')

# FIX: loop variable renamed from `c`, which shadowed base::c().
for (col_name in numerics) {
  print(boxplot(col_name))
}

## Warning: Removed 35086 rows containing non-finite values (stat_boxplot).

We can see from the boxplots that exciting projects tend to have a much higher great-messages proportion (share of unique comments on the project page) than non-exciting ones.

explore categorical variables

# Stacked-proportion bar chart: share of exciting projects within each
# level of one categorical column (name given as a string).
# Rows containing any NA are removed before plotting.
char_fill <- function(col) {
  plot_data <- na.omit(donor)
  ggplot(plot_data, aes(!!as.name(col), fill = as.factor(is_exciting))) +
    geom_bar(position = 'fill') +
    coord_flip() +
    labs(y = 'proportion')
}

# Categorical / flag columns to profile against the target.
dummy <- c('school_city','school_state','school_metro','school_district','school_county','teacher_prefix','primary_focus_subject','primary_focus_area','secondary_focus_subject','secondary_focus_area','resource_type','poverty_level','grade_level','is_exciting','one_non_teacher_referred_donor_g','school_charter','school_magnet','school_year_round','school_nlns','school_kipp','school_charter_ready_promise','teacher_teach_for_america','teacher_ny_teaching_fellow','eligible_double_your_impact_matc','eligible_almost_home_match')

invisible(lapply(dummy, function(column) print(char_fill(column))))

## explore the numeric variables (donation)

# Histogram ($10 bins) of one donation-amount column, truncated at $200
# so the long right tail does not flatten the plot.
donation_hist <- function(col) {
  donation %>%
    filter(.data[[col]] < 200) %>%
    ggplot(aes(x = .data[[col]])) +
    geom_histogram(binwidth = 10)
}

donation_hist("donation_to_project")
donation_hist("donation_optional_support")
donation_hist("donation_total")

## explore character variables

# Horizontal bar chart of counts for one categorical donation column
# (name given as a string); rows containing any NA are removed first.
bar <- function(col) {
  complete_rows <- na.omit(donation)
  ggplot(complete_rows, aes(.data[[col]])) +
    geom_bar() +
    coord_flip() +
    labs(y = 'count')
}

# Donation-side flag / categorical columns to profile.
dummy <- c('is_teacher_acct','dollar_amount','donation_included_optional_support','payment_method','payment_included_acct_credit','payment_included_campaign_gift_card','payment_included_web_purchased_gift_card','payment_was_promo_matched','via_giving_page','for_honoree')

invisible(lapply(dummy, function(column) print(bar(column))))

We can infer from these charts that some variables show relatively low association with donor behavior (the donation was made for an honoree; the donation was matched 1-1 with corporate funds; a portion of the donation included a corporate-sponsored gift card), while others show high association (the donor is a teacher; the donor used account-credit redemption; the payment included a corporate-sponsored gift card; the donation was given through a giving/campaign page). ## Prepare for clustering

# Remove ID / free-text / geography columns that should not drive clustering.
donation %>%
  dplyr::select(-donationid, -projectid, -donation_timestamp, -donor_acctid,
                -donor_city, -donor_zip, -donor_state) -> cluster

# Create dummy variables -------------------------------------------------

# One 0/1 factor per dollar_amount bucket.
# NOTE(review): "is_100_and_upn" keeps its original (typo'd) name so the
# profiling code and previously saved output stay consistent.
cluster$is_under_10 <- as.factor(ifelse(cluster$dollar_amount == 'under_10', 1, 0))
cluster$is_10_to_100 <- as.factor(ifelse(cluster$dollar_amount == '10_to_100', 1, 0))
cluster$is_100_and_upn <- as.factor(ifelse(cluster$dollar_amount == '100_and_up', 1, 0))

# One 0/1 factor per payment_method level (same creation order as before,
# so column order downstream is unchanged).
for (pm in c('no_cash_received', 'paypal', 'creditcard', 'amazon',
             'double_your_impact_match', 'promo_code_match', 'check',
             'almost_home_match')) {
  cluster[[pm]] <- as.factor(ifelse(cluster$payment_method == pm, 1, 0))
}

# Recode each TRUE/FALSE flag as a 0/1 factor under an is_* name.
# is_teacher_acct already carries the is_ prefix, so it is converted in
# place instead of being copied to a new column.
cluster$is_teacher_acct <- as.factor(ifelse(cluster$is_teacher_acct == 'TRUE', 1, 0))
flag_map <- c(
  is_donation_included_optional_support = 'donation_included_optional_support',
  is_payment_included_acct_credit = 'payment_included_acct_credit',
  is_payment_included_campaign_gift_card = 'payment_included_campaign_gift_card',
  is_payment_included_web_purchased_gift_card = 'payment_included_web_purchased_gift_card',
  is_payment_was_promo_matched = 'payment_was_promo_matched',
  is_via_giving_page = 'via_giving_page',
  is_for_honoree = 'for_honoree'
)
for (new_name in names(flag_map)) {
  src <- flag_map[[new_name]]
  cluster[[new_name]] <- as.factor(ifelse(cluster[[src]] == 'TRUE', 1, 0))
}

# Drop the raw columns now that dummies exist.
# FIX: the original select listed -payment_included_campaign_gift_card
# three times, and also dropped is_teacher_acct — whose dummy had been
# created IN PLACE under the same name — silently losing that feature
# from the clustering input. It is retained here.
cluster %>%
  dplyr::select(-dollar_amount, -payment_method, -donation_message,
                -donation_included_optional_support,
                -payment_included_acct_credit,
                -payment_included_campaign_gift_card,
                -payment_included_web_purchased_gift_card,
                -payment_was_promo_matched,
                -via_giving_page,
                -for_honoree) -> cluster

# Standardize numeric variables ------------------------------------------
# For every numeric column: append a z-scored copy named <col>_s, then drop
# the raw column.
# FIX: selecting with the bare external variable `col` is ambiguous
# tidyselect (it triggered the "use all_of(col)" warning); all_of() makes
# the string-based selection explicit.
for (col in colnames(cluster %>% select_if(is.numeric))) {
  cluster[, ncol(cluster) + 1] <- scale(cluster[col])
  names(cluster)[ncol(cluster)] <- paste0(col, '_s')
  cluster <- cluster %>% dplyr::select(-all_of(col))
}

# Convert the 0/1 factors to plain numerics (factor -> character -> numeric
# preserves the 0/1 labels) so kmeans() can consume every column.
cluster %>%
  mutate_if(is.factor, as.character) %>%
  mutate_if(is.character, as.numeric) -> cluster

head(cluster)

Cluster

# Choose number of clusters ----------------------------------------------
# FIX: seed the RNG *before* sample_n() so the 5,000-row sample — and hence
# the elbow plot and the k-means fit — is reproducible. The original
# called set.seed(30) only after sampling, leaving the sample random.
set.seed(30)
cluster_sample <- cluster %>%
  sample_n(5000)

# Elbow (within-cluster sum of squares) plot to pick k.
fviz_nbclust(cluster_sample, kmeans, method = "wss")

# Fit k-means with k = 5; nstart = 20 restarts to avoid poor local optima.
clusters <- kmeans(cluster_sample, 5, iter.max = 500, nstart = 20)
print(clusters)
## K-means clustering with 5 clusters of sizes 1933, 183, 22, 2068, 794
## 
## Cluster means:
##   is_under_10 is_10_to_100 is_100_and_upn no_cash_received      paypal
## 1   0.1350233    0.8649767    0.000000000        0.0000000 0.187790998
## 2   0.0000000    0.0000000    1.000000000        0.5409836 0.027322404
## 3   0.0000000    0.0000000    1.000000000        0.6818182 0.000000000
## 4   0.1939072    0.8041586    0.001934236        0.9956480 0.003868472
## 5   0.0000000    0.0000000    1.000000000        0.2959698 0.050377834
##   creditcard      amazon double_your_impact_match promo_code_match       check
## 1  0.5690636 0.058975685               0.01034661       0.16761511 0.005690636
## 2  0.1967213 0.000000000               0.10382514       0.00000000 0.071038251
## 3  0.0000000 0.000000000               0.04545455       0.00000000 0.181818182
## 4  0.0000000 0.000483559               0.00000000       0.00000000 0.000000000
## 5  0.3438287 0.027707809               0.17380353       0.05793451 0.022670025
##   almost_home_match is_donation_included_optional_support
## 1      0.0005173306                             0.9146405
## 2      0.0601092896                             0.9836066
## 3      0.0909090909                             0.9545455
## 4      0.0000000000                             0.8984526
## 5      0.0277078086                             0.9030227
##   is_payment_included_acct_credit is_payment_included_campaign_gift_card
## 1                      0.01345059                             0.01810657
## 2                      0.38251366                             0.07650273
## 3                      0.40909091                             0.13636364
## 4                      0.15764023                             0.75870406
## 5                      0.13853904                             0.12846348
##   is_payment_included_web_purchased_gift_card is_payment_was_promo_matched
## 1                                 0.002069322                   0.18986032
## 2                                 0.098360656                   0.00000000
## 3                                 0.136363636                   0.00000000
## 4                                 0.091392650                   0.00000000
## 5                                 0.051637280                   0.05541562
##   is_via_giving_page is_for_honoree donation_to_project_s
## 1          0.4899121    0.019141231            -0.1878061
## 2          0.3442623    0.027322404             1.9389363
## 3          0.4090909    0.000000000             7.0112678
## 4          0.1542553    0.007736944            -0.2226170
## 5          0.3916877    0.039042821             0.3786486
##   donation_optional_support_s donation_total_s
## 1                  -0.1980801       -0.1926735
## 2                   2.2576854        2.0190359
## 3                   7.5122981        7.2127789
## 4                  -0.2388145       -0.2290560
## 5                   0.3874131        0.3870553
## 
## Clustering vector:
##    [1] 1 1 4 5 4 1 1 5 1 4 1 4 5 4 2 5 4 5 1 5 1 5 4 1 4 5 4 1 4 1 4 4 1 4 4 4 1
##   [38] 4 1 1 5 2 1 4 4 4 4 4 1 4 4 1 4 4 1 5 4 4 1 4 2 4 1 4 5 1 1 4 4 1 1 1 1 4
##   [75] 4 4 1 1 4 1 4 4 4 4 1 4 4 4 1 4 5 4 1 4 1 5 1 1 1 4 1 2 4 4 4 1 4 1 1 1 4
##  [112] 1 5 1 2 1 4 1 4 5 1 4 1 5 1 1 1 1 4 4 5 4 5 5 2 1 4 4 4 1 1 1 4 1 2 5 1 4
##  [149] 4 2 4 4 1 1 5 4 4 4 1 4 4 4 5 4 5 4 4 4 1 1 1 4 4 1 4 1 4 1 5 1 1 1 5 2 4
##  [186] 1 5 1 4 4 1 5 1 4 1 1 1 2 4 1 1 1 4 1 4 4 1 1 4 2 4 4 5 4 1 1 4 2 1 4 4 1
##  [223] 4 4 1 4 5 4 5 1 4 4 1 1 1 1 4 4 4 1 4 4 1 1 4 1 5 5 5 4 4 4 1 4 4 1 1 1 1
##  [260] 1 4 1 5 5 4 5 1 5 1 1 4 4 5 5 1 1 1 1 1 1 1 2 4 5 4 1 1 4 4 5 1 4 1 4 1 5
##  [297] 4 4 1 4 4 5 1 1 1 5 1 1 1 4 4 4 2 4 1 1 1 4 1 5 4 4 1 4 4 4 4 1 4 1 4 4 4
##  [334] 1 1 4 5 1 5 4 5 4 5 1 1 1 4 4 1 4 1 4 1 4 4 1 2 5 2 5 5 1 4 1 5 1 4 5 4 1
##  [371] 4 4 4 4 4 4 1 4 2 1 5 5 4 1 4 4 4 1 4 5 5 5 4 1 4 1 1 1 4 4 1 1 5 2 4 4 1
##  [408] 4 1 1 1 4 4 4 1 4 4 5 4 5 1 1 5 2 2 5 2 1 1 1 1 4 1 4 4 1 1 4 1 1 5 5 4 4
##  [445] 4 4 5 4 1 4 1 1 1 4 1 1 1 4 4 4 1 4 1 4 1 4 1 4 1 2 5 4 1 5 5 1 4 4 1 4 4
##  [482] 4 1 4 4 1 1 4 1 4 1 4 1 4 5 4 1 1 1 4 4 4 4 5 2 4 1 1 4 4 4 4 1 5 4 1 1 5
##  [519] 4 2 5 1 1 4 1 4 4 4 4 1 1 4 4 4 1 5 1 1 5 1 4 4 5 4 1 5 1 1 4 1 1 1 4 5 1
##  [556] 1 4 1 4 4 1 4 4 1 1 4 4 5 4 1 1 5 1 4 5 2 4 4 1 1 1 4 1 4 1 4 1 4 1 4 4 5
##  [593] 4 4 1 5 1 5 1 5 4 1 4 1 4 4 4 5 4 4 1 5 1 1 4 1 4 1 1 1 1 1 4 1 5 5 1 4 4
##  [630] 1 1 1 1 1 5 1 4 5 1 4 1 1 1 5 5 4 4 4 4 1 1 4 2 2 4 1 4 1 5 1 4 4 1 4 1 1
##  [667] 1 1 1 2 4 4 2 1 5 5 1 1 1 1 1 1 5 4 4 1 5 5 1 4 4 5 1 1 4 1 1 4 4 4 5 1 1
##  [704] 1 4 4 4 1 1 1 1 1 4 4 1 1 1 1 1 1 4 4 4 5 4 1 5 4 4 2 1 4 4 4 1 4 4 1 1 1
##  [741] 1 1 1 5 4 4 4 1 4 1 5 4 1 5 4 4 5 4 4 4 5 5 2 4 1 5 1 1 4 1 4 4 1 4 1 1 5
##  [778] 4 1 1 5 4 1 4 4 1 1 4 4 1 1 4 5 5 4 4 4 4 2 5 1 4 4 4 4 5 1 1 4 4 1 4 4 4
##  [815] 4 4 1 1 4 1 1 4 4 5 4 1 1 4 4 1 4 1 4 1 4 4 1 1 1 4 5 5 1 4 1 5 5 1 5 4 1
##  [852] 1 4 4 1 4 5 1 4 5 4 4 1 4 4 2 5 1 1 4 4 4 4 4 4 4 4 4 5 1 5 4 1 1 4 5 4 4
##  [889] 1 1 4 1 5 4 1 4 5 4 4 4 1 4 4 4 5 5 1 4 4 5 4 4 4 4 2 5 5 1 4 4 4 1 5 5 1
##  [926] 1 1 1 4 1 5 4 1 5 1 1 5 4 4 5 2 1 1 4 1 4 4 4 1 4 5 4 1 1 4 5 4 4 4 1 4 4
##  [963] 5 4 1 4 4 4 1 5 5 4 4 4 4 1 1 4 4 1 4 4 1 1 5 4 1 1 1 4 4 1 5 4 1 1 5 5 4
## [1000] 1 4 1 1 4 4 4 5 4 4 4 4 2 1 1 5 4 5 4 5 4 5 1 4 4 1 4 4 4 4 1 4 1 1 1 1 4
## [1037] 1 1 1 4 4 1 1 4 4 4 4 4 1 1 4 5 1 5 4 4 2 4 1 4 1 1 4 4 5 1 1 5 5 4 4 4 1
## [1074] 1 4 1 2 5 5 4 4 4 1 4 5 4 4 1 1 1 1 1 2 1 4 1 4 2 4 4 4 1 1 5 4 1 1 4 5 1
## [1111] 1 1 1 1 1 4 4 4 1 5 1 4 5 1 2 4 1 1 4 1 4 5 4 1 4 2 5 5 1 4 1 1 4 4 1 4 4
## [1148] 4 4 5 1 4 4 1 4 2 4 5 1 1 4 4 4 4 1 5 4 1 1 2 4 1 5 4 1 1 5 4 1 2 1 4 1 1
## [1185] 1 4 1 1 4 4 5 5 5 4 1 5 4 1 4 4 5 4 5 1 4 4 1 1 4 4 2 1 4 1 4 4 1 1 4 4 1
## [1222] 4 5 1 5 1 1 5 4 4 5 5 4 1 1 4 1 4 1 1 1 4 4 1 5 4 1 1 1 1 4 1 1 4 4 4 4 1
## [1259] 4 4 1 4 4 4 1 4 1 4 4 5 4 1 4 1 4 5 4 5 1 4 4 4 4 1 1 4 1 1 1 4 5 1 5 1 4
## [1296] 1 4 4 5 1 4 4 1 1 4 4 1 1 4 4 4 4 1 5 1 4 4 1 4 2 1 1 2 4 1 4 1 4 4 5 1 1
## [1333] 5 1 4 4 4 4 4 1 1 1 4 1 4 4 1 5 1 4 4 1 1 4 1 1 4 5 4 1 4 4 4 4 5 1 4 2 1
## [1370] 5 1 4 1 4 1 5 1 4 1 4 1 1 5 5 4 4 4 5 1 4 1 1 5 4 4 4 1 5 4 1 1 1 1 4 1 1
## [1407] 1 4 4 4 1 5 4 1 1 1 4 1 1 4 4 4 4 5 4 4 1 1 4 1 4 1 5 1 4 4 4 1 1 5 2 4 1
## [1444] 1 1 1 5 4 1 1 1 4 1 4 5 1 1 5 4 1 4 1 4 1 4 1 4 1 1 5 1 1 5 5 5 1 4 4 4 4
## [1481] 2 4 1 1 4 4 1 4 2 5 1 5 5 2 1 4 5 4 4 1 1 4 1 1 1 1 2 1 4 4 5 1 5 1 1 1 1
## [1518] 4 2 1 1 4 5 1 2 4 1 5 1 4 4 1 5 5 4 4 4 1 1 4 1 5 1 1 1 1 5 4 1 5 5 1 3 1
## [1555] 1 1 4 4 1 5 4 5 4 4 1 4 4 1 5 1 5 4 4 1 1 1 4 2 1 5 4 2 1 4 5 2 2 4 4 1 1
## [1592] 3 4 4 4 1 5 1 4 4 4 1 1 4 1 4 4 4 1 4 4 1 4 2 4 5 4 4 4 2 1 4 4 1 1 4 4 4
## [1629] 1 4 1 4 4 1 1 5 1 1 5 4 4 5 1 4 4 5 1 4 4 1 1 2 1 1 4 5 1 5 4 4 4 1 5 1 4
## [1666] 1 1 5 4 4 4 1 5 4 5 4 2 5 1 5 4 5 1 1 4 4 5 5 1 1 5 4 4 1 4 1 5 5 4 4 4 4
## [1703] 4 4 1 1 5 1 5 2 4 1 1 1 1 4 4 4 5 4 5 2 4 4 5 1 4 4 5 5 5 4 1 4 4 4 1 4 1
## [1740] 4 4 1 1 2 2 1 1 1 1 5 5 4 4 4 1 1 1 4 4 1 4 4 5 3 4 1 1 1 1 2 4 1 1 4 4 5
## [1777] 1 5 1 1 4 4 4 4 4 1 4 5 1 4 5 1 4 1 4 1 1 4 4 1 1 4 4 1 1 1 4 1 1 1 1 1 1
## [1814] 1 4 4 1 2 1 1 1 1 4 5 4 4 4 4 4 1 2 5 4 4 4 4 4 1 4 1 1 4 2 4 5 4 4 1 4 1
## [1851] 1 5 1 1 1 1 4 5 1 1 4 4 1 1 1 4 1 1 5 1 4 1 1 4 1 4 1 5 1 4 4 1 1 4 1 5 1
## [1888] 4 4 5 5 5 4 5 1 4 1 4 1 4 4 1 1 1 1 1 4 1 4 5 5 1 5 4 4 1 4 4 4 5 4 5 5 4
## [1925] 1 1 4 5 4 1 4 1 1 4 4 1 5 1 1 4 4 4 4 1 4 4 1 1 1 4 4 5 4 4 1 5 4 1 4 5 5
## [1962] 1 5 3 5 1 4 4 1 4 1 1 1 1 4 1 1 1 4 4 4 5 1 4 4 4 5 4 5 1 1 4 4 5 4 1 4 1
## [1999] 4 1 1 5 4 1 1 4 4 5 4 1 4 1 1 5 1 1 4 5 1 1 4 2 2 4 2 5 4 1 5 1 1 5 5 4 1
## [2036] 1 4 1 5 4 4 4 1 5 4 3 4 4 4 4 4 5 1 3 4 1 5 5 5 5 5 1 4 4 5 1 1 5 5 1 1 4
## [2073] 1 4 4 1 4 1 1 1 3 4 1 1 1 1 4 4 5 4 5 5 5 5 4 1 4 5 4 4 1 1 1 5 4 4 4 4 4
## [2110] 4 4 1 5 4 2 4 1 1 4 1 1 4 4 4 5 5 5 4 4 2 5 4 5 4 1 4 1 1 4 4 1 4 4 1 4 1
## [2147] 1 5 5 2 4 1 1 4 4 4 4 4 1 5 1 1 4 4 1 4 1 1 1 1 5 4 4 4 1 5 4 1 1 2 1 2 1
## [2184] 4 4 4 4 4 1 4 5 1 5 1 4 1 1 4 1 1 1 2 5 1 1 4 1 1 4 1 4 4 4 1 5 4 1 4 1 4
## [2221] 4 1 4 1 4 4 5 4 1 4 4 4 4 5 1 1 5 1 1 1 1 2 1 4 4 1 4 2 1 1 4 4 4 1 4 1 4
## [2258] 5 4 1 1 1 2 5 5 1 1 2 4 4 5 4 1 1 1 1 4 4 1 1 5 1 4 5 4 1 1 5 4 5 1 4 2 4
## [2295] 1 1 4 4 4 5 4 1 4 2 1 5 1 1 1 4 4 1 4 4 4 4 1 4 1 1 4 1 1 1 4 4 5 1 4 5 4
## [2332] 4 4 1 5 5 4 1 5 1 1 5 1 5 4 5 4 5 5 1 1 2 4 1 5 1 1 4 4 4 4 2 1 4 4 1 5 4
## [2369] 4 1 4 4 1 5 1 1 2 5 1 1 4 4 4 1 1 4 5 4 1 5 5 1 2 1 5 4 4 1 1 1 4 4 4 4 5
## [2406] 5 4 1 1 1 4 4 4 4 4 1 5 4 4 4 1 4 1 2 1 4 1 1 4 5 1 4 5 1 1 1 4 1 1 4 4 1
## [2443] 4 5 4 4 5 4 4 4 4 4 1 5 1 4 1 2 4 1 4 4 2 4 1 1 5 4 5 4 1 4 5 4 5 5 1 4 1
## [2480] 1 5 1 5 1 5 5 5 1 4 4 1 1 1 4 1 1 4 1 4 4 4 4 2 1 1 1 4 1 1 5 1 1 4 1 5 5
## [2517] 4 3 1 4 4 5 1 4 4 5 4 5 1 1 4 4 4 4 4 1 1 4 1 1 4 4 2 1 5 4 1 5 5 1 1 1 1
## [2554] 1 5 1 1 4 4 4 1 4 1 4 5 4 1 1 1 4 4 4 4 4 5 5 4 4 4 1 1 1 4 4 4 4 4 1 1 1
## [2591] 1 1 1 1 4 4 4 1 1 4 4 4 4 4 4 4 5 4 4 4 4 4 4 1 1 1 4 2 4 5 4 1 4 1 5 5 1
## [2628] 1 4 4 1 2 4 1 4 1 5 4 4 1 2 4 4 1 4 1 4 1 4 4 1 5 4 1 5 4 1 1 1 4 5 3 1 5
## [2665] 4 5 1 4 1 4 1 1 4 4 5 4 5 1 4 4 1 1 1 5 4 5 1 5 4 4 1 4 5 1 1 5 4 4 1 4 4
## [2702] 4 4 1 5 4 4 4 1 5 5 4 1 5 1 5 1 2 4 4 5 1 5 1 4 4 4 5 5 4 4 4 1 4 1 4 4 4
## [2739] 4 5 1 1 4 4 4 5 1 4 1 4 4 1 1 4 1 1 4 4 4 1 1 5 1 4 1 1 1 1 1 4 1 5 5 4 1
## [2776] 4 5 1 4 1 1 4 4 4 1 4 4 1 4 4 1 1 5 2 1 4 4 2 5 4 4 5 4 4 3 5 4 4 5 4 5 4
## [2813] 1 4 4 1 1 1 4 4 1 5 1 4 1 4 1 4 1 1 4 5 5 5 1 4 1 1 4 1 4 4 1 4 1 4 1 1 1
## [2850] 1 4 4 4 5 1 5 1 4 4 1 1 5 1 4 1 1 1 5 1 4 4 4 1 1 5 1 4 1 5 4 5 1 4 4 4 4
## [2887] 4 1 1 1 2 1 1 4 1 1 5 4 4 1 4 1 5 2 1 4 5 4 1 1 1 1 4 5 1 1 4 1 4 4 4 2 4
## [2924] 1 1 4 1 5 1 1 4 4 5 4 1 4 1 4 4 1 1 4 4 4 1 5 4 1 4 4 1 1 5 5 1 5 1 4 1 4
## [2961] 1 4 1 5 1 5 4 1 5 4 4 5 5 5 5 5 1 1 4 1 4 4 4 1 5 1 4 1 5 1 2 1 4 4 1 1 4
## [2998] 4 4 4 1 1 1 5 1 2 3 1 1 5 4 4 4 1 4 4 4 4 4 2 4 5 1 4 5 4 1 1 5 4 1 4 4 1
## [3035] 1 4 1 4 4 4 1 1 5 1 4 4 4 4 1 1 5 4 4 4 1 4 2 4 5 5 1 1 5 1 5 4 5 4 2 1 5
## [3072] 4 1 1 1 4 5 1 1 5 1 4 1 1 1 4 5 5 4 4 4 1 1 5 5 1 1 1 1 4 4 4 4 5 5 1 4 5
## [3109] 1 4 4 1 1 1 1 1 1 1 1 1 1 4 1 4 4 1 5 1 4 5 1 1 4 1 1 1 1 1 1 2 1 5 1 1 1
## [3146] 4 4 1 4 4 1 1 4 4 1 1 4 5 4 1 4 1 1 4 1 2 2 1 4 5 1 4 4 2 1 4 5 1 1 1 4 4
## [3183] 4 1 2 4 5 4 4 1 4 1 1 1 1 1 4 5 2 1 4 1 1 4 1 4 4 4 1 4 1 4 1 1 1 4 1 4 5
## [3220] 4 4 4 1 4 1 5 4 1 2 4 1 1 5 1 2 4 4 5 1 1 1 1 4 1 5 1 1 1 4 2 1 4 1 5 5 1
## [3257] 4 1 2 2 1 4 1 5 1 4 4 1 2 4 1 4 5 1 4 5 5 1 5 5 4 1 1 1 1 5 1 4 1 2 4 4 4
## [3294] 5 1 4 4 4 4 1 1 4 4 1 4 4 5 4 4 4 1 5 3 1 4 4 4 4 4 5 4 4 1 4 1 1 1 4 1 4
## [3331] 1 4 2 1 1 1 1 4 5 5 4 1 4 4 4 4 5 4 2 4 1 4 1 1 5 4 1 1 4 5 4 1 4 1 4 4 1
## [3368] 5 4 1 4 5 4 4 4 5 1 5 1 1 4 4 5 4 2 1 4 1 4 1 2 1 5 1 1 2 4 4 5 1 3 1 1 4
## [3405] 1 1 1 4 2 5 5 4 1 5 4 4 5 1 5 1 5 1 4 1 1 4 1 1 4 4 4 4 1 1 5 5 4 4 4 4 4
## [3442] 4 4 1 4 1 1 1 1 4 4 4 5 4 4 1 5 4 4 4 5 4 4 4 5 1 4 1 2 1 4 1 4 4 4 2 1 1
## [3479] 4 1 4 4 4 1 1 1 2 1 4 4 1 1 1 2 1 4 4 4 4 5 1 4 4 1 4 1 4 4 1 4 4 1 1 4 4
## [3516] 4 4 1 5 1 4 1 5 4 1 1 4 4 1 5 4 5 4 1 1 1 2 4 4 4 1 1 1 4 4 4 1 4 1 4 4 5
## [3553] 1 1 5 5 4 5 1 4 4 4 5 4 1 5 4 4 1 5 4 1 4 5 1 5 4 4 4 2 4 1 4 1 4 1 5 4 4
## [3590] 1 1 4 1 5 1 1 4 4 1 1 1 4 2 5 1 1 1 1 1 4 1 1 4 1 5 4 4 4 1 1 1 4 1 1 4 1
## [3627] 1 4 4 4 1 1 5 4 1 4 4 4 4 1 4 5 1 5 1 4 1 5 1 4 1 4 1 1 4 1 5 1 1 5 4 1 5
## [3664] 1 1 4 2 4 4 4 1 4 5 5 1 4 4 5 5 1 4 5 4 1 4 1 1 4 1 3 1 4 4 4 4 1 1 1 1 4
## [3701] 2 1 4 5 1 1 1 5 1 1 4 4 4 4 1 4 4 1 1 4 4 1 4 4 5 4 5 2 1 4 4 5 1 4 4 1 4
## [3738] 4 4 4 1 1 5 1 1 4 1 1 1 4 4 4 1 1 1 4 4 1 2 4 1 5 1 4 1 4 4 4 1 4 1 1 1 1
## [3775] 4 1 4 1 4 4 1 1 4 4 1 1 5 1 4 4 4 1 5 4 5 5 5 1 1 4 1 4 5 1 1 5 5 1 5 1 1
## [3812] 4 5 1 5 4 5 1 4 1 1 4 4 5 1 4 4 5 4 1 1 2 1 1 1 1 1 1 4 1 4 1 4 4 5 1 4 4
## [3849] 1 5 1 5 1 1 4 1 4 4 5 4 1 4 1 5 1 1 4 4 4 1 4 4 4 5 4 1 4 5 2 1 4 1 1 4 4
## [3886] 4 2 4 4 4 4 1 4 4 4 1 1 5 1 4 4 4 5 5 1 5 1 4 5 4 1 4 5 1 4 5 4 4 1 5 4 4
## [3923] 1 5 5 4 4 1 1 1 5 4 4 4 4 1 1 4 4 4 1 5 1 1 1 1 4 5 2 1 1 1 4 1 4 4 4 1 5
## [3960] 1 4 1 4 1 5 4 4 5 4 1 5 4 5 4 2 4 4 4 1 1 4 4 1 1 4 1 4 1 1 4 5 1 4 5 4 1
## [3997] 1 1 5 1 4 4 1 2 1 4 1 4 5 1 4 4 1 4 4 4 1 4 1 3 1 1 4 5 4 4 4 1 1 4 4 1 5
## [4034] 1 1 1 4 4 4 4 4 5 4 4 1 4 4 4 1 4 5 1 4 1 1 1 4 4 1 4 4 1 4 1 1 1 1 4 1 2
## [4071] 4 4 1 5 4 4 1 1 4 4 4 1 1 5 4 4 4 4 5 4 1 1 1 2 4 1 4 1 4 4 1 4 4 1 1 4 5
## [4108] 1 4 1 4 1 1 1 4 1 4 4 5 4 5 4 5 1 4 1 5 4 1 1 5 4 1 1 4 4 4 1 1 2 4 4 4 4
## [4145] 5 4 1 1 4 4 1 4 4 5 5 1 1 1 5 1 4 4 1 4 4 4 4 4 5 1 4 4 1 4 1 1 1 4 4 4 4
## [4182] 2 2 4 1 5 5 4 1 4 4 5 2 4 5 5 1 1 4 5 4 1 5 4 4 4 4 4 4 5 4 4 5 1 4 4 1 1
## [4219] 1 4 1 2 4 4 4 4 2 1 1 1 5 3 5 1 4 1 4 5 4 5 1 4 4 4 4 4 1 4 1 5 1 5 1 1 1
## [4256] 4 1 4 1 5 4 4 2 5 1 4 1 2 1 5 4 4 1 1 1 1 5 4 5 2 5 5 1 4 5 1 1 1 4 5 1 1
## [4293] 4 1 1 1 1 4 4 4 4 4 4 1 1 4 4 5 5 4 4 1 4 1 1 1 4 5 1 4 5 1 5 4 4 4 1 4 1
## [4330] 5 4 1 5 4 1 4 5 2 1 4 1 2 1 1 5 4 1 4 4 4 1 3 5 4 1 1 1 4 1 4 4 5 4 1 1 4
## [4367] 5 4 4 5 4 5 1 5 1 1 4 4 4 1 5 1 4 4 4 2 5 4 5 5 4 4 5 1 4 4 5 1 1 1 4 1 1
## [4404] 4 1 5 4 1 5 5 1 4 5 1 1 5 1 4 1 1 4 1 4 1 4 1 5 1 4 1 2 4 4 4 4 1 4 1 4 4
## [4441] 5 4 1 1 5 4 5 4 1 4 1 1 4 1 4 5 4 5 4 1 1 5 1 1 1 4 1 4 5 5 2 4 5 4 4 4 1
## [4478] 4 4 5 1 1 1 4 1 4 1 5 1 4 1 5 1 1 4 1 4 4 4 4 4 4 4 1 4 4 4 4 4 5 5 5 4 5
## [4515] 1 1 1 1 1 4 4 1 1 1 1 1 1 1 1 4 4 1 4 1 1 2 5 5 5 3 4 4 5 1 4 2 2 1 4 1 4
## [4552] 4 1 1 1 1 1 4 4 5 2 4 4 1 1 1 4 4 4 1 4 4 1 5 4 4 4 1 4 4 5 1 1 4 4 1 4 1
## [4589] 4 1 1 4 1 1 4 5 5 4 4 2 3 4 4 5 4 5 4 4 5 4 1 4 5 5 1 1 1 5 4 1 1 4 4 5 4
## [4626] 1 5 1 4 4 4 5 4 1 1 4 4 4 1 1 4 5 1 5 5 1 4 1 4 4 4 1 4 4 1 5 5 4 4 5 4 1
## [4663] 4 4 4 1 1 1 1 1 4 5 4 1 1 4 1 4 5 1 5 5 1 4 1 1 1 4 4 4 4 5 4 1 5 4 1 4 1
## [4700] 4 5 1 1 1 4 4 4 1 4 1 1 1 5 1 4 4 1 5 4 5 1 4 5 1 1 4 1 2 1 1 4 1 1 4 1 5
## [4737] 4 4 4 1 4 1 1 4 4 1 5 5 4 1 5 4 4 1 4 4 5 2 3 1 1 5 4 4 4 5 4 5 1 1 1 1 4
## [4774] 1 5 1 1 3 1 5 4 4 4 4 4 5 2 4 5 4 1 1 4 5 4 2 1 5 5 5 5 1 2 4 4 4 1 5 5 1
## [4811] 4 1 4 1 1 4 4 5 4 4 1 2 5 1 1 1 4 1 1 1 1 5 5 5 1 4 4 2 2 4 4 4 5 2 4 4 4
## [4848] 4 4 1 4 1 4 4 4 4 1 1 1 4 5 1 2 4 2 5 5 1 1 1 5 1 2 1 5 1 1 1 5 5 1 4 1 1
## [4885] 4 4 1 4 1 1 1 4 1 1 4 4 1 1 5 4 4 4 5 1 4 1 5 1 4 1 1 4 4 1 2 4 4 1 4 2 1
## [4922] 1 4 4 2 1 4 4 4 1 4 4 1 1 4 4 4 1 1 4 4 1 1 5 4 5 4 5 1 3 1 5 4 4 1 1 1 4
## [4959] 1 4 1 4 1 1 1 4 5 1 1 4 1 1 4 1 1 2 5 4 5 1 4 4 4 4 4 4 1 4 5 1 1 4 4 5 5
## [4996] 4 5 1 4 4
## 
## Within cluster sum of squares by cluster:
## [1] 2711.7616  599.2561  512.7381 2005.3851 1455.3231
##  (between_SS / total_SS =  58.7 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
## [6] "betweenss"    "size"         "iter"         "ifault"
# Visualize clusters in the first two principal components.
fviz_cluster(clusters, cluster_sample, ellipse.type = "norm", geom = "point")

# Determine which variables are driving the cluster creation -------------

# Profile clusters: attach the assignment as a factor column, then compare
# each factor variable's distribution overall vs per cluster.
cluster_sample$cluster <- as.factor(clusters$cluster)
for (col in colnames(cluster_sample %>% select_if(is.factor))) {
  # Overall distribution.
  p <- cluster_sample %>% ggplot(aes(!!as.name(col))) + geom_bar()
  print(p)

  # Per-cluster distribution.
  # FIX: facet on the in-frame `cluster` column rather than the external
  # vector clusters$cluster, which ggplot2 matches only by row position
  # and which breaks under any filtering or reordering of the data.
  p <- cluster_sample %>%
    ggplot(aes(!!as.name(col))) + geom_bar() + facet_wrap(~cluster)
  print(p)
}

# Same profiling for the numeric (standardized) variables.
for (col in colnames(cluster_sample %>% select_if(is.numeric))) {
  # Overall distribution.
  p <- cluster_sample %>%
    ggplot(aes(!!as.name(col))) + geom_histogram()
  print(p)

  # Per-cluster distribution.
  # FIX: facet on the in-frame `cluster` column instead of the external
  # clusters$cluster vector (position-matched only, fragile).
  p <- cluster_sample %>%
    ggplot(aes(!!as.name(col))) + geom_histogram() + facet_wrap(~cluster)
  print(p)
}
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Prediction

Data Transformation

# Data transformation ----------------------------------------------------
# Drop identifiers and high-cardinality geography columns, then make every
# remaining character column (and the target) a factor for modeling.
data <- donor %>%
  dplyr::select(-projectid, -school_latitude, -school_longitude,
                -teacher_acctid, -schoolid, -school_ncesid, -school_city,
                -school_state, -school_zip, -school_metro, -school_district,
                -school_county, -primary_focus_subject,
                -secondary_focus_area) %>%
  mutate_if(is.character, factor) %>%
  mutate(is_exciting = as.factor(is_exciting))

# The TRUE/FALSE flags are parsed as logicals by read_csv(), so the
# character-to-factor step above misses them. Convert them in one pass
# instead of eleven copy-pasted assignments.
flag_cols <- c("one_non_teacher_referred_donor_g", "school_charter",
               "school_magnet", "school_year_round", "school_nlns",
               "school_kipp", "school_charter_ready_promise",
               "teacher_teach_for_america", "teacher_ny_teaching_fellow",
               "eligible_double_your_impact_matc",
               "eligible_almost_home_match")
data <- data %>% mutate(across(all_of(flag_cols), as.factor))

head(data)

Partition Data

# Partition data ----------------------------------------------------------
# FIX: set the seed *before* sample_n() so both the 100k subsample and the
# 75/25 split are reproducible; the original seeded only the split, leaving
# the subsample random.
set.seed(1200)
data_sample <- data %>% sample_n(100000)
split <- initial_split(data_sample, prop = 0.75)

train <- training(split)
test <- testing(split)

# Sanity-check the partition proportions.
sprintf("Train PCT : %1.2f%%", nrow(train) / nrow(data_sample) * 100)
## [1] "Train PCT : 75.00%"
sprintf("Test PCT : %1.2f%%", nrow(test) / nrow(data_sample) * 100)
## [1] "Test PCT : 25.00%"
train

We begin with the model with full variables.

Full Recipe

# Preprocessing recipe, estimated (prep) on the training data only so that
# imputation medians, pooled levels, and dummy encodings cannot leak from
# the test set.
# NOTE: step order matters — impute/novel/unknown/other must run before
# step_dummy, and step_nzv runs last to drop near-zero-variance dummies.
donor_recipe <- recipe(is_exciting ~.,data = train) %>%
  step_impute_median(all_numeric_predictors()) %>% # fill numeric NAs with the training median
  step_novel(all_nominal_predictors()) %>% # tolerate factor levels unseen in training
  step_unknown(all_nominal_predictors()) %>% # recode nominal NAs as an explicit "unknown" level
  step_other(all_nominal_predictors(),threshold = 0.01) %>%  # pool levels rarer than 1% into "other"
  step_dummy(all_nominal_predictors(), one_hot = TRUE) %>% # one-hot encode every nominal predictor
  step_nzv(all_predictors())%>%  prep() # drop near-zero-variance columns, then estimate the recipe
# -- apply the estimated recipe to both partitions
bake_train <- bake(donor_recipe, new_data = train)
bake_test  <- bake(donor_recipe, new_data = test)

Logistic Regression

# Full logistic regression on every baked predictor.
# NOTE(review): glm reported non-convergence and fitted probabilities of
# 0/1 here — symptoms of (quasi-)separation in the full predictor set; the
# reduced model below is the one used for scoring.
log_model <- logistic_reg(mode = "classification") %>%
  set_engine("glm") %>%
  fit(is_exciting ~ ., data = bake_train)

# Coefficient table, rounded for readability.
# FIX: mutate_at() is superseded — use across() with a lambda.
tidy(log_model) %>%
  mutate(across(c("estimate", "std.error", "statistic", "p.value"),
                ~ round(.x, 4)))
## New model restricted to the significant predictors.
# FIX: eligible_double_your_impact_matc_FALSE. appeared TWICE in the
# original formula; the redundant duplicate is removed (R's terms()
# de-duplicates it anyway, so fitted values are unchanged).
log_model2 <- logistic_reg(mode = "classification") %>%
  set_engine("glm") %>%
  fit(is_exciting ~ great_messages_proportion + teacher_referred_count +
        non_teacher_referred_count + fulfillment_labor_materials +
        total_price_excluding_optional_s + total_price_including_optional_s +
        days_from_now + teacher_teach_for_america_FALSE. +
        teacher_prefix_Mr. + one_non_teacher_referred_donor_g_FALSE. +
        eligible_double_your_impact_matc_FALSE. + teacher_prefix_Mrs. +
        resource_type_Technology + eligible_almost_home_match_FALSE.,
      data = bake_train)

# Coefficient table, rounded (across() instead of superseded mutate_at()).
tidy(log_model2) %>%
  mutate(across(c("estimate", "std.error", "statistic", "p.value"),
                ~ round(.x, 4)))
# -- training predictions from the reduced logistic model:
#    class probabilities, predicted class, and the baked features side by side
scored_train_log_model2 <- bind_cols(
  predict(log_model2, bake_train, type = "prob"),
  predict(log_model2, bake_train),
  bake_train
)
head(scored_train_log_model2)

# -- testing predictions from the reduced logistic model
scored_test_log_model2 <- bind_cols(
  predict(log_model2, bake_test, type = "prob"),
  predict(log_model2, bake_test),
  bake_test
)
head(scored_test_log_model2)
# -- Accuracy / kappa / AUC-style metrics for train and test, stacked
train_metrics <- scored_train_log_model2 %>%
  metrics(is_exciting, .pred_FALSE, estimate = .pred_class) %>%
  mutate(part = "training")
test_metrics <- scored_test_log_model2 %>%
  metrics(is_exciting, .pred_FALSE, estimate = .pred_class) %>%
  mutate(part = "testing")
bind_rows(train_metrics, test_metrics)
# Precision and recall ----------------------------------------------------
# event_level = 'second' scores the TRUE ("exciting") class as the event.
# FIX: the original passed event_level only to precision(), so recall()
# was silently computed for the FALSE (majority) class — the ~0.98
# "recall" in the earlier output is the negative-class recall, not the
# recall for exciting projects. Both metrics now target the same class.
scored_train_log_model2 %>%
  precision(is_exciting, .pred_class, event_level = 'second') %>%
  mutate(part = "training") %>%
  bind_rows(
    scored_test_log_model2 %>%
      precision(is_exciting, .pred_class, event_level = 'second') %>%
      mutate(part = "testing")
  ) %>% print()

scored_train_log_model2 %>%
  recall(is_exciting, .pred_class, event_level = 'second') %>%
  mutate(part = "training") %>%
  bind_rows(
    scored_test_log_model2 %>%
      recall(is_exciting, .pred_class, event_level = 'second') %>%
      mutate(part = "testing")
  ) %>% print()
# -- ROC curves for train and test overlaid on one chart
bind_rows(
  scored_train_log_model2 %>% mutate(model = "train"),
  scored_test_log_model2 %>% mutate(model = "test")
) %>%
  group_by(model) %>%
  roc_curve(is_exciting, .pred_FALSE) %>%
  autoplot()

## Confusion matrices for each partition
scored_train_log_model2 %>%
  conf_mat(is_exciting, .pred_class) %>%
  autoplot(type = "heatmap") +
  labs(title="Train Confusion Matrix")

scored_test_log_model2 %>%
  conf_mat(is_exciting, .pred_class) %>%
  autoplot(type = "heatmap") +
  labs(title="Test Confusion Matrix")

# Refit the full logistic model inside a workflow so the recipe and model
# are bundled together for prediction.
log_model <- logistic_reg() %>%
  set_mode("classification") %>%
  set_engine("glm")

log_workflow <- workflow() %>%
  add_recipe(donor_recipe) %>%
  add_model(log_model) %>%
  fit(train)

# Coefficient table, rounded.
# FIX: predicate functions passed to across() must be wrapped in where()
# (the original triggered the deprecation message), and extra arguments
# are now supplied via a lambda rather than `round, 3`.
log_workflow %>%
  extract_fit_parsnip() %>%
  tidy() %>%
  mutate(across(where(is.numeric), ~ round(.x, 3)))

Random Forest

# 100-tree random forest; impurity importance is recorded so variable
# importance can be extracted later.
rf_model <- rand_forest(trees = 100, min_n = 10) %>%
  set_mode("classification") %>%
  set_engine("ranger", importance = "impurity")

rf_workflow <- workflow() %>%
  add_recipe(donor_recipe) %>%
  add_model(rf_model) %>%
  fit(train)

# -- score both partitions: probabilities + class + original columns
scored_test <- predict(rf_workflow, test, type = "prob") %>%
  bind_cols(predict(rf_workflow, test, type = "class")) %>%
  bind_cols(test)
scored_train <- predict(rf_workflow, train, type = "prob") %>%
  bind_cols(predict(rf_workflow, train, type = "class")) %>%
  bind_cols(train)

# Evaluation on the test split -----------------------------------------------

# NOTE: `options(yardstick.event_first = FALSE)` is deprecated as of
# yardstick 0.0.7 and ignored in later versions. It is kept here only so
# older yardstick installs still treat TRUE (the second factor level) as the
# event; every metric call below also sets `event_level = "second"`
# explicitly, which is the supported replacement and takes precedence.
options(yardstick.event_first = FALSE)

# -- score the test set: probabilities + class predictions --
test_score <- predict(rf_workflow, test, type = "prob") %>%
  bind_cols(predict(rf_workflow, test, type = "class")) %>%
  bind_cols(test)

# -- metrics --
# Explicit metric_set equivalent to yardstick::metrics() for a two-class
# outcome (accuracy, kap, mn_log_loss, roc_auc), but accepting event_level.
rf_metric_set <- metric_set(accuracy, kap, mn_log_loss, roc_auc)
test_score %>%
  rf_metric_set(is_exciting, .pred_TRUE, estimate = .pred_class,
                event_level = "second")

# -- roc curve plot --
test_score %>%
  roc_curve(is_exciting, .pred_TRUE, event_level = "second") %>%
  autoplot()

# -- operating range: max threshold and TPR at each rounded FPR in [1%, 10%] --
test_score %>%
  roc_curve(is_exciting, .pred_TRUE, event_level = "second") %>%
  mutate(fpr = round(1 - specificity, 2),
         tpr = round(sensitivity, 3),
         score_threshold = round(.threshold, 3)) %>%
  group_by(fpr) %>%
  summarise(threshold = max(score_threshold),
            tpr = max(tpr)) %>%
  filter(fpr >= 0.01 & fpr <= 0.10)

# -- roc curve with the 5% FPR operating point marked --
# BUG FIX: the original put `color = "red"` inside aes(), which maps a
# constant to the colour scale (legend entry labelled "red") instead of
# drawing a red line; set it outside aes().
test_score %>%
  roc_curve(is_exciting, .pred_TRUE, event_level = "second") %>%
  autoplot() +
  geom_vline(xintercept = 0.05, color = "red") +
  labs(title = "ROC operating at 5% FPR")

# -- thresholded class at 0.50 --
# NOTE(review): this result is printed, not assigned, so it does not affect
# the confusion matrix below — confirm whether it should overwrite test_score.
test_score %>%
  mutate(.pred_class = if_else(.pred_TRUE >= 0.5, 1, 0)) %>%
  mutate(.pred_class = as.factor(.pred_class))

# -- confusion matrix --
test_score %>%
  conf_mat(is_exciting, .pred_class) %>%
  autoplot(type = "heatmap") +
  labs(title = "Test Confusion Matrix")

# Evaluation on the training split -------------------------------------------

# NOTE: deprecated global option kept only for older yardstick installs;
# `event_level = "second"` is passed explicitly in each call below and is
# the supported replacement.
options(yardstick.event_first = FALSE)

# -- score the training set: probabilities + class predictions --
train_score <- predict(rf_workflow, train, type = "prob") %>%
  bind_cols(predict(rf_workflow, train, type = "class")) %>%
  bind_cols(train)

# -- metrics --
# Explicit metric_set equivalent to yardstick::metrics() for a two-class
# outcome (accuracy, kap, mn_log_loss, roc_auc), but accepting event_level.
rf_metric_set_train <- metric_set(accuracy, kap, mn_log_loss, roc_auc)
train_score %>%
  rf_metric_set_train(is_exciting, .pred_TRUE, estimate = .pred_class,
                      event_level = "second")

# -- roc curve plot --
train_score %>%
  roc_curve(is_exciting, .pred_TRUE, event_level = "second") %>%
  autoplot()

# -- operating range: max threshold and TPR at each rounded FPR in [1%, 10%] --
train_score %>%
  roc_curve(is_exciting, .pred_TRUE, event_level = "second") %>%
  mutate(fpr = round(1 - specificity, 2),
         tpr = round(sensitivity, 3),
         score_threshold = round(.threshold, 3)) %>%
  group_by(fpr) %>%
  summarise(threshold = max(score_threshold),
            tpr = max(tpr)) %>%
  filter(fpr >= 0.01 & fpr <= 0.10)

# -- roc curve with the 5% FPR operating point marked --
# BUG FIX: `color = "red"` belongs outside aes(); inside aes() it maps a
# constant to the colour scale instead of drawing a red line.
train_score %>%
  roc_curve(is_exciting, .pred_TRUE, event_level = "second") %>%
  autoplot() +
  geom_vline(xintercept = 0.05, color = "red") +
  labs(title = "ROC operating at 5% FPR")

# -- thresholded class at 0.50 --
# NOTE(review): result is printed, not assigned — it does not affect the
# confusion matrix below; confirm whether it should overwrite train_score.
train_score %>%
  mutate(.pred_class = if_else(.pred_TRUE >= 0.5, 1, 0)) %>%
  mutate(.pred_class = as.factor(.pred_class))

# -- confusion matrix --
train_score %>%
  conf_mat(is_exciting, .pred_class) %>%
  autoplot(type = "heatmap") +
  labs(title = "Train Confusion Matrix")

## Variable importance for the ranger fit.
## FIX: pull_workflow_fit() was deprecated in workflows 0.2.3;
## extract_fit_parsnip() is the supported replacement (this also removes
## the deprecation warning the original printed).
rf_workflow %>%
  extract_fit_parsnip() %>%
  vip()